Data.Table - 6/2
library(data.table)
## Warning: package 'data.table' was built under R version 3.3.2
setwd("~/Documents/SDAL/R/R Practice/")
# download.file("http://download.geonames.org/export/zip/GB_full.csv.zip", "GB_full.csv.zip")
dt <- fread(unzip(zipfile = "~/Documents/SDAL/R/R Practice/GB_full.csv.zip"))
##
Read 44.2% of 1720673 rows
Read 74.4% of 1720673 rows
Read 1720673 rows and 12 (of 12) columns from 0.214 GB file in 00:00:04
names(dt)
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11"
## [12] "V12"
# subset columns
dt_subset_col <- dt[ , .(V2, V3, V4)] # instead of c(), do .()
head(dt_subset_col)
## V2 V3 V4
## 1: AB10 Aberdeen Scotland
## 2: AB10 1AB George St/Harbour Ward Scotland
## 3: AB10 1AF George St/Harbour Ward Scotland
## 4: AB10 1AG George St/Harbour Ward Scotland
## 5: AB10 1AH George St/Harbour Ward Scotland
## 6: AB10 1AL George St/Harbour Ward Scotland
class(dt_subset_col)
## [1] "data.table" "data.frame"
df_subset <- as.data.frame(dt_subset_col)
rm("df_subset", "dt_subset_col")
# subset rows
dt_subset_row <- dt[V4 == "England", ]
head(dt_subset_row)
## V1 V2 V3 V4 V5 V6 V7
## 1: GB AL1 St Albans England ENG Hertfordshire HRT
## 2: GB AL10 Hatfield England ENG Hertfordshire HRT
## 3: GB AL10 0AA Hatfield England ENG Hertfordshire County E10000015
## 4: GB AL10 0AB Hatfield England ENG Hertfordshire County E10000015
## 5: GB AL10 0AD Hatfield England ENG Hertfordshire County E10000015
## 6: GB AL10 0AE Hatfield England ENG Hertfordshire County E10000015
## V8 V9 V10 V11 V12
## 1: NA NA 4
## 2: NA NA 4
## 3: Welwyn Hatfield District (B) E07000241 51.76426 -0.2262538 6
## 4: Welwyn Hatfield District (B) E07000241 51.77312 -0.2233410 6
## 5: Welwyn Hatfield District (B) E07000241 51.77348 -0.2187323 6
## 6: Welwyn Hatfield District (B) E07000241 51.77302 -0.2255192 6
# sort datatable by column
dt[order(V4)] # ascending
## V1 V2 V3 V4 V5
## 1: GB GY1 St Peter Port
## 2: GB GY10 Sark
## 3: GB GY3 Vale
## 4: GB GY4 St Martin
## 5: GB GY5 Castel
## ---
## 1720669: GB SY9 Cefn Einion Wales WLS
## 1720670: GB SY9 Lea Wales WLS
## 1720671: GB SY9 Bryn Wales WLS
## 1720672: GB SY9 Linley Wales WLS
## 1720673: GB SY9 5JP Churchstoke Community Wales WLS
## V6 V7 V8 V9
## 1: Guernsey Channel Islands
## 2: Channel Islands
## 3: Guernsey Channel Islands
## 4: Guernsey Channel Islands
## 5: Guernsey Channel Islands
## ---
## 1720669: Shropshire SHR
## 1720670: Shropshire SHR
## 1720671: Shropshire SHR
## 1720672: Shropshire SHR
## 1720673: Powys - Powys W06000023 Powys - Powys W06000023
## V10 V11 V12
## 1: NA NA 4
## 2: NA NA 4
## 3: NA NA 1
## 4: NA NA 4
## 5: NA NA 4
## ---
## 1720669: NA NA 3
## 1720670: NA NA 4
## 1720671: NA NA 4
## 1720672: NA NA 4
## 1720673: 52.50256 -3.054109 6
dt[order(-V4, V3)] # descending V4, ascending V3
## V1 V2 V3 V4 V5
## 1: GB LD1 6PG Abbey Cwmhir Community Wales WLS
## 2: GB LD1 6PH Abbey Cwmhir Community Wales WLS
## 3: GB LD1 6PL Abbey Cwmhir Community Wales WLS
## 4: GB LD1 6PN Abbey Cwmhir Community Wales WLS
## 5: GB LD1 6PP Abbey Cwmhir Community Wales WLS
## ---
## 1720669: GB IM7 The Cronk
## 1720670: GB IM9 The Howe
## 1720671: GB IM7 The Lhen
## 1720672: GB IM4 Union Mills
## 1720673: GB GY3 Vale
## V6 V7 V8 V9
## 1: Powys - Powys W06000023 Powys - Powys W06000023
## 2: Powys - Powys W06000023 Powys - Powys W06000023
## 3: Powys - Powys W06000023 Powys - Powys W06000023
## 4: Powys - Powys W06000023 Powys - Powys W06000023
## 5: Powys - Powys W06000023 Powys - Powys W06000023
## ---
## 1720669: Isle of Man IOM
## 1720670: Isle of Man IOM
## 1720671: Isle of Man IOM
## 1720672: Isle of Man IOM
## 1720673: Guernsey Channel Islands
## V10 V11 V12
## 1: 52.30944 -3.353440 6
## 2: 52.33015 -3.388163 6
## 3: 52.32855 -3.395935 6
## 4: 52.32428 -3.411208 6
## 5: 52.35929 -3.411780 6
## ---
## 1720669: NA NA 3
## 1720670: NA NA 3
## 1720671: NA NA 3
## 1720672: NA NA 4
## 1720673: NA NA 1
# adding new columns
# instead of df$new_col <- df$var1 + df$var2 ....
dt[, new_col := V10 + V11] # use := insead of <-
names(dt)
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7"
## [8] "V8" "V9" "V10" "V11" "V12" "new_col"
# re-code a variable
dt[V8 == "Aberdeen City", V8 := 'Abr City']
head(dt[, V8])
## [1] "" "Abr City" "Abr City" "Abr City" "Abr City" "Abr City"
# deleting columns
dt[, c('V6', 'V7') := NULL] # back to using c()....
# using built-in functions
head(dt)
## V1 V2 V3 V4 V5 V8 V9
## 1: GB AB10 Aberdeen Scotland SCT
## 2: GB AB10 1AB George St/Harbour Ward Scotland SCT Abr City S12000033
## 3: GB AB10 1AF George St/Harbour Ward Scotland SCT Abr City S12000033
## 4: GB AB10 1AG George St/Harbour Ward Scotland SCT Abr City S12000033
## 5: GB AB10 1AH George St/Harbour Ward Scotland SCT Abr City S12000033
## 6: GB AB10 1AL George St/Harbour Ward Scotland SCT Abr City S12000033
## V10 V11 V12 new_col
## 1: NA NA 4 NA
## 2: 57.14961 -2.096916 6 55.05269
## 3: 57.14871 -2.097806 6 55.05090
## 4: 57.14907 -2.096997 6 55.05207
## 5: 57.14808 -2.094664 6 55.05342
## 6: 57.14954 -2.095412 6 55.05413
dt[, .(average = mean(V10, na.rm = TRUE)), by = V4] # store mean(V10) for each V4 in new column called 'average'
## V4 average
## 1: Scotland 56.19868
## 2: England 52.36342
## 3: Northern Ireland NaN
## 4: Wales 52.11765
## 5: NaN
# melting
aqdt <- as.data.table(airquality)
aqdt_melt <- data.table::melt(data = aqdt,
id.vars = c("Month", "Day"),
measure.vars = c("Ozone", "Solar.R", "Wind", "Temp"),
variable.name = 'measurement',
value.name = 'reading')
## Warning in melt.data.table(data = aqdt, id.vars = c("Month", "Day"),
## measure.vars = c("Ozone", : 'measure.vars' [Ozone, Solar.R, Wind, Temp]
## are not all of the same type. By order of hierarchy, the molten data value
## column will be of type 'double'. All measure variables not of type 'double'
## will be coerced to. Check DETAILS in ?melt.data.table for more on coercion.
dim(aqdt)
## [1] 153 6
dim(aqdt_melt)
## [1] 612 4
# casting
data.table::dcast(data = aqdt_melt, Month + Day ~ measurement) # this 'undoes' the melt process above
## Using 'reading' as value column. Use 'value.var' to override
## Month Day Ozone Solar.R Wind Temp
## 1: 5 1 41 190 7.4 67
## 2: 5 2 36 118 8.0 72
## 3: 5 3 12 149 12.6 74
## 4: 5 4 18 313 11.5 62
## 5: 5 5 NA NA 14.3 56
## ---
## 149: 9 26 30 193 6.9 70
## 150: 9 27 NA 145 13.2 77
## 151: 9 28 14 191 14.3 75
## 152: 9 29 18 131 8.0 76
## 153: 9 30 20 223 11.5 68
Modeling
devtools::install_github('bi-sdal/sdalr', force = TRUE)
## Downloading GitHub repo bi-sdal/sdalr@master
## from URL https://api.github.com/repos/bi-sdal/sdalr/zipball/master
## Installing sdalr
## Installing DBI
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools2901f6539a1/DBI' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing maps
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools2904955fdcf/maps' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing tmaptools
## Installing classInt
## Installing e1071
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools29022688908/e1071' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools2902c394c9e/classInt' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing osmar
## Installing XML
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools29041493f0/XML' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools29075654e78/osmar' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing rgdal
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools290c7bf052/rgdal' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing rgeos
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools2906e92abcc/rgeos' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing rmapshaper
## Installing geojsonio
## Installing httr
## Installing curl
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools2901f35cd86/curl' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing mime
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools2905d1061cb/mime' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing openssl
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools29033061d83/openssl' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing R6
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools2903b35d5d2/R6' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools29043ae0917/httr' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing readr
## Installing BH
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools2902206d290/BH' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing hms
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools290be72db0/hms' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing Rcpp
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools2901b76710a/Rcpp' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing tibble
## Installing rlang
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools290150a3508/rlang' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools2907e3ec6ac/tibble' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools2902226ae4c/readr' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing V8
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools29029e62c8f/V8' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools29037dff0d3/geojsonio' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing geojsonlint
## Installing jsonvalidate
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools290764b2dde/jsonvalidate' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools29043ed9a4a/geojsonlint' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Skipping install of 'readr' from a cran remote, the SHA1 (1.1.1) has not changed since last install.
## Use `force = TRUE` to force installation
## Skipping install of 'V8' from a cran remote, the SHA1 (1.5) has not changed since last install.
## Use `force = TRUE` to force installation
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools29068215bc7/rmapshaper' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing spdep
## Installing coda
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools29064e5a92f/coda' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing deldir
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools290312e0452/deldir' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing expm
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools29016afd6f3/expm' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing gmodels
## Installing gdata
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools290191489e2/gdata' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools2903851e739/gmodels' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Installing LearnBayes
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools290728e1d36/LearnBayes' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools2902b9f2ace/spdep' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## Skipping install of 'XML' from a cran remote, the SHA1 (3.98-1.9) has not changed since last install.
## Use `force = TRUE` to force installation
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools29036cdf2cb/tmaptools' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/pn/6vp0b7x51x59kxvnb8dl_t1m0000gn/T/RtmpMH43Wg/devtools2905c0e5153/bi-sdal-sdalr-68ab1ce' \
## --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library' \
## --install-tests
##
data("anscombe")
summary(anscombe)
## x1 x2 x3 x4
## Min. : 4.0 Min. : 4.0 Min. : 4.0 Min. : 8
## 1st Qu.: 6.5 1st Qu.: 6.5 1st Qu.: 6.5 1st Qu.: 8
## Median : 9.0 Median : 9.0 Median : 9.0 Median : 8
## Mean : 9.0 Mean : 9.0 Mean : 9.0 Mean : 9
## 3rd Qu.:11.5 3rd Qu.:11.5 3rd Qu.:11.5 3rd Qu.: 8
## Max. :14.0 Max. :14.0 Max. :14.0 Max. :19
## y1 y2 y3 y4
## Min. : 4.260 Min. :3.100 Min. : 5.39 Min. : 5.250
## 1st Qu.: 6.315 1st Qu.:6.695 1st Qu.: 6.25 1st Qu.: 6.170
## Median : 7.580 Median :8.140 Median : 7.11 Median : 7.040
## Mean : 7.501 Mean :7.501 Mean : 7.50 Mean : 7.501
## 3rd Qu.: 8.570 3rd Qu.:8.950 3rd Qu.: 7.98 3rd Qu.: 8.190
## Max. :10.840 Max. :9.260 Max. :12.74 Max. :12.500
sapply(1:4, function(x){
cor(anscombe[, x], anscombe[, x+4])})
## [1] 0.8164205 0.8162365 0.8162867 0.8165214
summary(lm(y1 ~ x1, data = anscombe))
##
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.92127 -0.45577 -0.04136 0.70941 1.83882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0001 1.1247 2.667 0.02573 *
## x1 0.5001 0.1179 4.241 0.00217 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared: 0.6665, Adjusted R-squared: 0.6295
## F-statistic: 17.99 on 1 and 9 DF, p-value: 0.00217
data('anscombosaurus', package = 'sdalr')
summary(anscombosaurus)
## x y
## Min. :22.31 Min. : 2.949
## 1st Qu.:44.10 1st Qu.:25.288
## Median :53.33 Median :46.026
## Mean :54.26 Mean :47.832
## 3rd Qu.:64.74 3rd Qu.:68.526
## Max. :98.21 Max. :99.487
summary(lm(y~x, data = anscombosaurus))
##
## Call:
## lm(formula = y ~ x, data = anscombosaurus)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.439 -23.213 -1.296 21.368 52.050
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.4530 7.6934 6.948 1.29e-10 ***
## x -0.1036 0.1355 -0.764 0.446
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.98 on 140 degrees of freedom
## Multiple R-squared: 0.004157, Adjusted R-squared: -0.002957
## F-statistic: 0.5844 on 1 and 140 DF, p-value: 0.4459
plot(anscombosaurus$x, anscombosaurus$y)

library(ggplot2)
diamonds <- diamonds
quantile(diamonds$price, c(.10, .25, .50, .75))
## 10% 25% 50% 75%
## 646.00 950.00 2401.00 5324.25
economics <- economics
cor(economics$pce, economics$psavert)
## [1] -0.837069
econ_subset <- economics[, c('pce', 'psavert', 'uempmed', 'unemploy')]
library(GGally)
## Warning: package 'GGally' was built under R version 3.3.2
GGally::ggpairs(data = econ_subset)

m <- glm(price ~ carat, data=diamonds)
summary(m)
##
## Call:
## glm(formula = price ~ carat, data = diamonds)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -18585.3 -804.8 -18.9 537.4 12731.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2256.36 13.06 -172.8 <2e-16 ***
## carat 7756.43 14.07 551.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 2398044)
##
## Null deviance: 8.5847e+11 on 53939 degrees of freedom
## Residual deviance: 1.2935e+11 on 53938 degrees of freedom
## AIC: 945467
##
## Number of Fisher Scoring iterations: 2
m <- glm(price ~ carat + table + as.factor(color), data=diamonds)
summary(m)
##
## Call:
## glm(formula = price ~ carat + table + as.factor(color), data = diamonds)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -18514.1 -756.5 -75.4 552.9 12156.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1611.744 163.349 9.867 < 2e-16 ***
## carat 8134.005 14.179 573.662 < 2e-16 ***
## table -76.013 2.868 -26.503 < 2e-16 ***
## as.factor(color).L -1581.839 22.177 -71.327 < 2e-16 ***
## as.factor(color).Q -731.790 20.270 -36.102 < 2e-16 ***
## as.factor(color).C -114.228 19.024 -6.004 1.93e-09 ***
## as.factor(color)^4 72.067 17.471 4.125 3.71e-05 ***
## as.factor(color)^5 -140.016 16.517 -8.477 < 2e-16 ***
## as.factor(color)^6 -174.730 14.983 -11.662 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 2137611)
##
## Null deviance: 8.5847e+11 on 53939 degrees of freedom
## Residual deviance: 1.1528e+11 on 53931 degrees of freedom
## AIC: 939272
##
## Number of Fisher Scoring iterations: 2
library(broom)
## Warning: package 'broom' was built under R version 3.3.2
broom::tidy(m)$estimate
## [1] 1611.74365 8134.00465 -76.01264 -1581.83911 -731.78962 -114.22765
## [7] 72.06672 -140.01590 -174.73000
# broom::augment(m)
broom::glance(m)
## null.deviance df.null logLik AIC BIC deviance
## 1 858473135517 53939 -469626.2 939272.4 939361.3 115283523794
## df.residual
## 1 53931
plot(m)




# do not want to see a pattern in the residual plot
# QQ plot should show straight line (depending on distribution of QQ plot)
mpg_bin <- function(mpg) {
if (mpg > 22) {
return('good')
} else {
return('poor')
}
}
mpg_bin_int <- function(mpg) {
if (mpg > 22) {
return(1)
} else {
return(0)
}
}
mtcars$mpg_bin <- sapply(X = mtcars$mpg, FUN = mpg_bin)
mtcars$mpg_bin_int <- sapply(X = mtcars$mpg, FUN = mpg_bin_int)
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
## mpg_bin mpg_bin_int
## Mazda RX4 poor 0
## Mazda RX4 Wag poor 0
## Datsun 710 good 1
## Hornet 4 Drive poor 0
## Hornet Sportabout poor 0
## Valiant poor 0
m <- glm(formula = mpg_bin_int ~ hp + wt, data = mtcars, family = binomial(link = 'logit'))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m)
##
## Call:
## glm(formula = mpg_bin_int ~ hp + wt, family = binomial(link = "logit"),
## data = mtcars)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.72029 -0.00913 -0.00001 0.00314 1.40334
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 30.4063 18.0320 1.686 0.0917 .
## hp -0.2201 0.1447 -1.521 0.1283
## wt -3.1801 1.9659 -1.618 0.1057
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 38.0243 on 31 degrees of freedom
## Residual deviance: 6.5068 on 29 degrees of freedom
## AIC: 12.507
##
## Number of Fisher Scoring iterations: 10
results <- broom::tidy(m)
results$or <- exp(results$estimate)
results
## term estimate std.error statistic p.value or
## 1 (Intercept) 30.4062991 18.0320177 1.686239 0.09174969 1.604309e+13
## 2 hp -0.2200611 0.1446913 -1.520901 0.12828471 8.024697e-01
## 3 wt -3.1801489 1.9658545 -1.617693 0.10572880 4.157947e-02
library(survival)
head(heart) # about people on waiting list for Harvard heart transplant
## start stop event age year surgery transplant id
## 1 0 50 1 -17.155373 0.1232033 0 0 1
## 2 0 6 1 3.835729 0.2546201 0 0 2
## 3 0 1 0 6.297057 0.2655715 0 0 3
## 4 1 16 1 6.297057 0.2655715 0 1 3
## 5 0 36 0 -7.737166 0.4900753 0 0 4
## 6 36 39 1 -7.737166 0.4900753 0 1 4
cox_model <- coxph(Surv(heart$start, heart$stop, heart$event) ~ age + year + as.factor(surgery) + as.factor(transplant), data = heart)
summary(cox_model)
## Call:
## coxph(formula = Surv(heart$start, heart$stop, heart$event) ~
## age + year + as.factor(surgery) + as.factor(transplant),
## data = heart)
##
## n= 172, number of events= 75
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.02717 1.02754 0.01371 1.981 0.0476 *
## year -0.14635 0.86386 0.07047 -2.077 0.0378 *
## as.factor(surgery)1 -0.63721 0.52877 0.36723 -1.735 0.0827 .
## as.factor(transplant)1 -0.01025 0.98980 0.31375 -0.033 0.9739
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.0275 0.9732 1.0003 1.0555
## year 0.8639 1.1576 0.7524 0.9918
## as.factor(surgery)1 0.5288 1.8912 0.2574 1.0860
## as.factor(transplant)1 0.9898 1.0103 0.5352 1.8307
##
## Concordance= 0.636 (se = 0.037 )
## Rsquare= 0.084 (max possible= 0.969 )
## Likelihood ratio test= 15.11 on 4 df, p=0.004476
## Wald test = 14.49 on 4 df, p=0.005877
## Score (logrank) test = 15.03 on 4 df, p=0.004631
plot(survfit(cox_model)) # kaplan meyer curve (everyone is alive on day zero, as the number of days increase, the solid line shows you how many people are left in your population)

m1 <- glm(price ~ carat, data = diamonds)
m2 <- glm(price ~ carat + as.factor(cut), data = diamonds)
m3 <- glm(price ~ carat + as.factor(cut) + depth, data = diamonds)
m4 <- glm(price ~ carat + as.factor(cut) + depth + table, data = diamonds)
# ways of comparing models
anova(m1, m2, m3, m4)
## Analysis of Deviance Table
##
## Model 1: price ~ carat
## Model 2: price ~ carat + as.factor(cut)
## Model 3: price ~ carat + as.factor(cut) + depth
## Model 4: price ~ carat + as.factor(cut) + depth + table
## Resid. Df Resid. Dev Df Deviance
## 1 53938 1.2935e+11
## 2 53934 1.2321e+11 4 6133201436
## 3 53933 1.2297e+11 1 246635400
## 4 53932 1.2270e+11 1 264127832
# AIC and BIC put penalty on every new variable you add to a model
# looking for the smallest number to identify best model
AIC(m1, m2, m3, m4)
## df AIC
## m1 3 945466.5
## m2 7 942854.2
## m3 8 942748.1
## m4 9 942634.2
BIC(m1, m2, m3, m4)
## df BIC
## m1 3 945493.2
## m2 7 942916.5
## m3 8 942819.3
## m4 9 942714.2
# cross-validation (cv) prediction error for generalized linear models (glm)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:survival':
##
## aml
cv1 <- cv.glm(data = diamonds, glmfit = m1, K = 5) # K = number of partitions you want your data in (5 and 10 are popular)
cv2 <- cv.glm(data = diamonds, glmfit = m2, K = 5)
cv3 <- cv.glm(data = diamonds, glmfit = m3, K = 5)
cv4 <- cv.glm(data = diamonds, glmfit = m4, K = 5)
# print errors from all cross validations; pick the one with the smallest errors
cv1$delta
## [1] 2398543 2398478
cv2$delta
## [1] 2285631 2285478
cv3$delta
## [1] 2280494 2280403
cv4$delta
## [1] 2275951 2275821
Bayesian Statistics
# commands to run a simple 1-d mcmc.
# number of mcmc scans
Niter <- 1000
# logdens = log density
logdens1 <- function(x){
# returns the log of the standard normal density
# the density is not normalized here
return(-.5*x^2)
}
# build a vector to store the mcmc output
x <- rep(0,Niter)
# initialize x[1] with a starting value
x[1] <- 20.0
# compute the log of the posterior density for x[1]
logpost <- logdens1(x[1])
# width of metropolis proposal
a <- 3 # use a uniform U[-a,a] metropolis proposal (give or take 3)
# carry out the Metropolis sampling
for(i in 2:Niter){
can = x[i-1] + runif(1,-a,a) # generate candidate centered at old value, give or take a
logpostcan <- logdens1(can) # generate the posterior density of that
u <- runif(1) # generate uniform to tell us whether or not we accept this value
if(log(u) < (logpostcan - logpost)){ # if the uniform is < ratio of new/old posterior, then accept (leave current value at the old value)
# accept the proposal
x[i] <- can
logpost <- logpostcan # update the posterior density
} else{
x[i] <- x[i-1] # set the new value to the old value; leave logpost as is.
}
}
# make a plot of the output
# use par to tell R to plot a single image, and set the margin area
par(mfrow=c(1,2),oma=c(0,0,0,0))#,matlabr=c(4,4,1,1))
plot(1:Niter,x,type='l')
hist(x[-(1:100)],main=paste('a =',a,sep=''),xlab='x',probability=T)
lines(seq(-3,3,length=200),dnorm(seq(-3,3,length=200)))

browser()
## Called from: eval(expr, envir, enclos)
# now lets look at what happens if we change a and the starting value.
# it'll be easier to make the mcmc iterations a function
mcmc1d <- function(x0,Niter,a,LOGDENS){
# run a metropolis chain using U(x-a,x+a) proposals
# x0 is the starting value
# Niter is the number of scans in this metropolis chain
# LOGDENS is the log of the density from which we want draws.
# the function returns a vector holding the output from the Metropolis sample
x <- rep(0,Niter)
# initialize x[1] with a starting value
x[1] <- x0
# compute the log of the posterior density for x[1]
logpost <- LOGDENS(x[1])
# carry out the Metropolis sampling
for(i in 2:Niter){
can = x[i-1] + runif(1,-a,a)
logpostcan <- LOGDENS(can)
u <- runif(1)
if(log(u) < (logpostcan - logpost)){
# accept the proposal
x[i] <- can
logpost <- logpostcan
} else{
x[i] <- x[i-1] # set the new value to the old value; leave logpost as is.
}
}
return(x)
}
# first, let's look at the effect of changing the starting value
# we'll look at a=20,10,5,0
# this should motivate ideas about "burn-in": Should we discard
# some of the initial piece of the chain so that our estimates
# are more accurate?
x1 <- mcmc1d(20,1000,3,logdens1)
x2 <- mcmc1d(10,1000,3,logdens1)
x3 <- mcmc1d(5,1000,3,logdens1)
x4 <- mcmc1d(0,1000,3,logdens1)
# look at some plots
par(mfrow=c(2,2),oma=c(0,0,0,0),mar=c(4,4,1,1))
plot(1:Niter,x1,ylim=c(-3,21),type='l')
plot(1:Niter,x2,ylim=c(-3,21),type='l')
plot(1:Niter,x3,ylim=c(-3,21),type='l')
plot(1:Niter,x4,ylim=c(-3,21),type='l')

browser()
# now, let's look at the effect of changing the proposal width a
# we'll look at a=20,10,3,0.1
x1 <- mcmc1d(20,1000,20,logdens1)
x2 <- mcmc1d(20,1000,3,logdens1)
x3 <- mcmc1d(20,1000,1,logdens1)
x4 <- mcmc1d(20,1000,.1,logdens1)
# look at some plots
# different widths - no labels
par(mfrow=c(2,2),oma=c(0,0,0,0),mar=c(4,4,1,1))
plot(1:Niter,x1,ylim=c(-3,21),type='l',xlab='iteration')
plot(1:Niter,x2,ylim=c(-3,21),type='l',xlab='iteration')
plot(1:Niter,x3,ylim=c(-3,21),type='l',xlab='iteration')
plot(1:Niter,x4,ylim=c(-3,21),type='l',xlab='iteration')

browser()
# show the labels
par(mfrow=c(2,2),oma=c(0,0,0,0),mar=c(4,4,1,1))
plot(1:Niter,x1,ylim=c(-3,21),type='l',xlab='iteration')
text(700,17,'a=20')
plot(1:Niter,x2,ylim=c(-3,21),type='l',xlab='iteration')
text(700,17,'a=3')
plot(1:Niter,x3,ylim=c(-3,21),type='l',xlab='iteration')
text(700,17,'a=1')
plot(1:Niter,x4,ylim=c(-3,21),type='l',xlab='iteration')
text(700,17,'a=.1')

# LESSON: a high (bad?) a is good to get you near the distribution quickly, but once you're close to it it doesn't perform as well as a lower a.
browser()
# now, let's look at the effect of changing the density
# we'll look at a mixture of normals
# a question we can look at is what width is best for
# sampling from this density via the metropolis algorithm
#
logdens2 <- function(x){
# returns a mixture of normals
# .6*N(mu=-1,sd=.4) + .4*N(mu=.8,sd=.4)
# the density is logged
dens <- .6*dnorm(x,-1,.4)+.4*dnorm(x,.8,.4)
return(log(dens))
}
logdens3 <- function(x){
# returns a mixture of normals
# .6*N(mu=-1,sd=.4) + .4*U(0,1)
# the density is logged
dens <- .6*dnorm(x,-1,.4)+.4*(x>0)*(x<1)
return(log(dens))
}
logdens4 <- function(x){
# returns a mixture of normals
# .6*U[-1.2,-.2] + .4*dnorm(x,1,.3)
# the density is logged
dens <- .6*(x > -1.2)*(x < -.2) +.4*dnorm(x,1,.3)
return(log(dens))
}
browser()
# a plot of the density
x <- seq(-3,3,length=200)
par(mfrow=c(1,1),oma=c(0,0,0,0),mar=c(4,4,1,1))
plot(x,exp(logdens2(x)),type='l')

browser()
x1 <- mcmc1d(2,1000,20,logdens2)
x2 <- mcmc1d(2,1000,3,logdens2)
x3 <- mcmc1d(2,1000,1,logdens2)
x4 <- mcmc1d(2,1000,.2,logdens2)
# look at some plots
par(mfrow=c(2,2),oma=c(0,0,0,0),mar=c(4,4,1,1))
plot(1:Niter,x1,ylim=c(-3,5),type='l',xlab='iteration')
text(700,4,'a=20')
plot(1:Niter,x2,ylim=c(-3,5),type='l',xlab='iteration')
text(700,4,'a=3')
plot(1:Niter,x3,ylim=c(-3,5),type='l',xlab='iteration')
text(700,4,'a=1')
plot(1:Niter,x4,ylim=c(-3,5),type='l',xlab='iteration')
text(700,4,'a=.2')

browser()
par(mfrow=c(2,2),oma=c(0,0,0,0),mar=c(4,4,1,1))
hist(x1[-(1:100)],probability=TRUE,ylim=c(0,.75),xlim=c(-3,3),xlab='x',
main='a=20')
lines(x,exp(logdens2(x)))
hist(x2[-(1:100)],probability=TRUE,ylim=c(0,.75),xlim=c(-3,3),xlab='x',
main='a=3')
lines(x,exp(logdens2(x)))
hist(x3[-(1:100)],probability=TRUE,ylim=c(0,.75),xlim=c(-3,3),xlab='x',
main='a=1')
lines(x,exp(logdens2(x)))
hist(x4[-(1:100)],probability=TRUE,ylim=c(0,.75),xlim=c(-3,3),xlab='x',
main='a=.2')
lines(x,exp(logdens2(x)))

browser()
#
# for the students, they can look at the mixture of normals problem. What is
# their estimate for the P(x > 1.5)? What is their choice for proposal width a?
# Did you discard any of the initial part of the chain?
#